Fechar

@comment{
  Journal article: IEEE Transactions on Parallel and Distributed Systems,
  vol. 26, no. 7 (2015), pp. 2061--2074, DOI 10.1109/TPDS.2014.2342228.
  The fields affiliation, label, targetfile and urlaccessdate are not
  standard BibTeX fields; they look like repository-export metadata and are
  ignored by the standard styles (NOTE(review): confirm no local tool reads
  them before pruning). The language field is the language of the article
  text (English), recorded as an ISO 639-1 code.
}
@Article{MenesesNiZheMenKal:2015:UsMiOb,
               author = "Meneses, Esteban and Ni, Xiang and Zheng, Gengbin and Mendes,
                         Celso Luiz and Kale, Laxmikant V.",
          affiliation = "{University of Pittsburgh} and {University of Illinois} and
                         {National Center for Supercomputing Applications} and {Instituto
                         Nacional de Pesquisas Espaciais (INPE)} and {University of
                         Illinois}",
                title = "Using migratable objects to enhance fault tolerance schemes in
                         supercomputers",
              journal = "IEEE Transactions on Parallel and Distributed Systems",
                 year = "2015",
               volume = "26",
               number = "7",
                pages = "2061--2074",
             keywords = "Migratable objects, fault tolerance, resilience,
                         checkpoint/restart, message logging",
             abstract = "Supercomputers have seen an exponential increase in their size in
                         the last two decades. Such a high growth rate is expected to take
                         us to exascale in the timeframe 2018-2022. But, to bring a
                         productive exascale environment about, it is necessary to focus on
                         several key challenges. One of those challenges is fault
                         tolerance. Machines at extreme scale will experience frequent
                         failures and will require the system to avoid or overcome those
                         failures. Various techniques have recently been developed to
                         tolerate failures. The impact of these techniques and their
                         scalability can be substantially enhanced by a parallel
                         programming model called migratable objects. In this paper, we
                         demonstrate how the migratable-objects model facilitates and
                         improves several fault tolerance approaches. Our experimental
                         results on thousands of cores suggest fault tolerance schemes
                         based on migratable objects have low performance overhead and high
                         scalability. Additionally, we present a performance model that
                         predicts a significant benefit of using migratable objects to
                         provide fault tolerance at extreme scale.",
                  doi = "10.1109/TPDS.2014.2342228",
                  url = "https://doi.org/10.1109/TPDS.2014.2342228",
                 issn = "1045-9219",
                label = "lattes: 9051364483671452 4 MenesesNiZheMenKal:2015:UsMiOb",
             language = "en",
           targetfile = "1_meneses.pdf",
        urlaccessdate = "27 abr. 2024"
}


Fechar